clear all 
set more off 
set maxvar 15000 
clear matrix

	use "$Mydirectory1/3_Output/2_PooledData_analysis.dta", clear 
    keep if baseline_sample==1
    
    rename wgt_sex_race weight
    
    gen non_log_father = exp(log_father_closest_census_v2) /* KEY */

    forval i=1(1)2 {
        gen est_`i'=. 
        gen est_lb_`i' =.
        gen est_ub_`i' =.
    }
     
    tempfile surveys 
    save `surveys'

*--------------------------------------*
*--------------------------------------*

*---------------------------------------*
/* BRING IN EARLY COHORT TSIV 
   RESULTS --- 1910-1930
   (DATA FROM NBER SERVER) */
*---------------------------------------*

use "$Mydirectory2/appendix_d/TSIV_results_1940_1936fix_levels_subgroups.dta", clear

    keep if est_!=.

    tempfile early
    save `early'

*--------------------------------------*
*--------------------------------------*

*-----------------------------*
* TSIV RESULTS--- 1940-1970
*-----------------------------*

    local father_inc "non_log_father"
    
* Later decades: 1940-1970 

foreach m in 1 2 3 4 {
    foreach x1 in 4 5 6 7   {

        if "`x1'"=="4" local x2 "5" //use 1950 census here
        if "`x1'"=="5" local x2 "6"
        if "`x1'"=="6" local x2 "7"
        if "`x1'"=="7" local x2 "8"
        
        noisily display "cohort 19`x1'0, group `m'"

    * Bring in corresponding Census 
        use "$CensusData/output/Census19`x2'0_fathers_ages30to50.dta", clear
        
    * Rename a couple variables
        if "`x1'"=="4" {
            clonevar HHinc = inctot
            rename slwt weight
        }
        else { 
            rename perwt weight
        }

    * Re-center weight to have mean 1
        quietly sum weight, d
        replace weight = weight/`r(mean)'
        quietly sum weight, d
        assert `r(mean)'==1

    * Append with survey data           
        append using `surveys' 
        replace census=0 if census==.
        
    * Restrict sample and set up triplets
        if "`m'"=="1" {
            keep if census==1 | (census==0 & decade==19`x1'0 & sex==1 & race==1) 
        }
        if "`m'"=="2" {
            keep if census==1 | (census==0 & decade==19`x1'0 & race==1)
        }
        if "`m'"=="3" {
            keep if census==1 | (census==0 & decade==19`x1'0 & (race==1 | (race==2 & sex==1)))
        }
        if "`m'"=="4" {
            keep if census==1 | (census==0 & decade==19`x1'0) 
        }
        
        drop if fatheroccej==99 //99 = non-working fathers
        
        local var_list "fatheroccej race south_merge" 
        egen triplet = group(`var_list')
        
        bysort census triplet: gen tag = _n==1 
        bysort triplet: egen number_samples = sum(tag) 
        tab number_samples census 
        keep if number_samples==2 //keep if triplet is in both census and surveys
        
    * Remake the dummies for the triplets that exist in both datasets
        drop triplet* 
        egen triplet = group(`var_list')
        quietly tab triplet, gen(triplet_)
        local max = `r(r)'
        local max_minus1 = `r(r)'-1 //grabs # of dummies minus 1
        drop triplet_`max'
        
    * Save the dataset we'll use for all regressions 
        tempfile restricted_data
        save `restricted_data'
        
    *------------------------------* 
        
    * TSIV without any corrections --- naive approach
        quietly reg HHinc triplet_* if  census==1 [aw=weight] 
            predict yhat if census==0 
        
        reg fam_inc_real yhat if census==0 [aw=weight], robust 

        tempfile results
        save `results'
                 
    /* TSIV correcting standard errors based on two sample uncertainty 
       Source: (https://ars.els-cdn.com/content/image/1-s2.0-S0165176516302373-mmc1.pdf) */
        
        * First stage  
        keep if census==1
        gen const=1 
        gen x1 = HHinc
        
        //robust 
        quietly reg x1 triplet_* [aw=weight], robust
        mat Vx2het = e(V)
        
        //non-robust
        quietly sureg (x1 = triplet_*) [aw=weight]
        mat Vx2hom = e(V)
        
        * Second stage 
        use `restricted_data', clear
        keep if census==0
        gen y = fam_inc_real  
        
        predict x1h, equation(x1) //generate predicted x using Census results from sureg
        
        scalar kx = 1 //number of predicted variables
        scalar ke = 1 //number of exogenous variables (constant)
        
        quietly reg y triplet_* [aw=weight]
        mat Vy1hom = e(V)*e(df_r)/_N
        
        quietly reg y triplet_* [aw=weight], robust  
        mat Vy1het = e(V)*e(df_r)/_N
        
        /* TS2SLS estimator */
        reg y x1h [aw=weight]
        scalar coeff1 = _b[x1h]
        display coeff1
        mat b2s = e(b)
        mat b2sx = b2s[1,1..kx]
        
        /* Constructing C hat */
        quietly reg triplet_1 x1h [aw=weight]
        mat ch = e(b)'
        forval b=2(1)`max_minus1' {
        quietly reg triplet_`b' x1h [aw=weight]
        mat ch = ch,e(b)'   
        }
        mat ch = ch,(J(kx,ke,0)\I(ke))
        
        * Calculating non-robust standard errors 
        mat var1hom = ch*Vy1hom*ch' + (b2sx' # ch)*Vx2hom*(b2sx # ch')
        mat seb2shom = vecdiag(cholesky(diag(vecdiag(var1hom))))'
        
        * Calculating robust standard errors 
        mat var1het = ch*Vy1het*ch' + (b2sx' # ch)*Vx2het*(b2sx # ch')
        mat seb2shet = vecdiag(cholesky(diag(vecdiag(var1het))))'
        
        mat list seb2shom
        mat list seb2shet
        
        scalar se1=seb2shet[1,1]
        display se1
        
        scalar ub1 = coeff1 + (1.96*se1)
        scalar lb1 = coeff1 - (1.96*se1)
         
        
    * Bring back full dataset; save estimates 
        use `results', clear
        replace est_1 = coeff1 
        replace est_ub_1 = ub1 
        replace est_lb_1 = lb1 
                
    * Save
        bysort decade: keep if _n==1
        drop if decade==.
        keep est_* decade 
        gen group =`m'
        
        tempfile results_`x1'_`m'
        save `results_`x1'_`m''

        } 
    }
 
* Append results   
    use `results_4_1'
    append using `results_5_1'
    append using `results_6_1'
    append using `results_7_1'
    
    forval m==2(1)4 {
        append using `results_4_`m''
        append using `results_5_`m''
        append using `results_6_`m''
        append using `results_7_`m''
    }
    
    sort decade group
    reshape long est_ est_lb_ est_ub_, i(decade group) j(estimate)
    
    replace decade= decade+1.5 if group==2
    replace decade= decade+3 if group==3
    replace decade= decade+4.5 if group==4


    append using `early'
    sort decade estimate    

* Figure
    #delimit ;
    twoway (scatter est_ decade if group==1,  msymbol(circle) mcolor(blue) msize(medium) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if group==1, lpatter(solid) lcolor(blue) yaxis(1) lwidth(0.5))
           (scatter est_ decade if group==2,  msymbol(triangle) mcolor(purple) msize(small) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if group==2, lpatter(solid) lcolor(purple) yaxis(1) lwidth(0.5) )
           (scatter est_ decade if group==3,  msymbol(circle_hollow) mcolor(red) msize(small) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if group==3, lpatter(solid) lcolor(red) yaxis(1) lwidth(0.5) )
           (scatter est_ decade if group==4,  msymbol(triangle_hollow) mcolor(orange*0.7) msize(small) yaxis(1)) 
           (rcap est_lb_  est_ub_  decade if group==4, lpatter(solid) lcolor(orange*0.7) yaxis(1) lwidth(0.5) )
           ,
    xti(" " "Decade of respondent's birth") xlabel(1920(10)1970) xscale(range(1915 1975))
    ylabel(0(.25)1.5, axis(1)) yti("Coefficient" " ", axis(1)) scheme(s1color)
    legend(on ring(0) size(medsmall) row(3) pos(8) order(1 "White men" 3 "+ White women" 5 "+ Black men" 7 "+ Black women"))
    xlabel(1920 "1920s" 1930 "1930s" 1940 "1940s" 1950 "1950s" 1960 "1960s" 1970 "1970s", labsize(small) ) ;  
    #delimit cr
    graph export "$Mydirectory2/appendix_d/Levels_TSLS_bysubgroup.pdf", as(pdf) replace
    